

<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta name="Description" content="scikit-learn: machine learning in Python">

  
  <title>sklearn.datasets.load_svmlight_file &mdash; scikit-learn 0.22 documentation</title>
  
  <link rel="canonical" href="http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html" />

  
  <link rel="shortcut icon" href="../../_static/favicon.ico"/>
  

  <link rel="stylesheet" href="../../_static/css/vendor/bootstrap.min.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/gallery.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script> 
</head>
<body>
<nav id="navbar" class="sk-docs-navbar navbar navbar-expand-md navbar-light bg-light py-0">
  <div class="container-fluid sk-docs-container px-0">
      <a class="navbar-brand py-0" href="../../index.html">
        <img
          class="sk-brand-img"
          src="../../_static/scikit-learn-logo-small.png"
          alt="logo"/>
      </a>
    <button
      id="sk-navbar-toggler"
      class="navbar-toggler"
      type="button"
      data-toggle="collapse"
      data-target="#navbarSupportedContent"
      aria-controls="navbarSupportedContent"
      aria-expanded="false"
      aria-label="Toggle navigation"
    >
      <span class="navbar-toggler-icon"></span>
    </button>

    <div class="sk-navbar-collapse collapse navbar-collapse" id="navbarSupportedContent">
      <ul class="navbar-nav mr-auto">
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../../install.html">Install</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../../user_guide.html">User Guide</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../classes.html">API</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../../auto_examples/index.html">Examples</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../getting_started.html">Getting Started</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../tutorial/index.html">Tutorial</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../glossary.html">Glossary</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../developers/index.html">Development</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../faq.html">FAQ</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../related_projects.html">Related packages</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../roadmap.html">Roadmap</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../about.html">About us</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="https://github.com/scikit-learn/scikit-learn">GitHub</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="https://scikit-learn.org/dev/versions.html">Other Versions</a>
        </li>
        <li class="nav-item dropdown nav-more-item-dropdown">
          <a class="sk-nav-link nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
          <div class="dropdown-menu" aria-labelledby="navbarDropdown">
              <a class="sk-nav-dropdown-item dropdown-item" href="../../getting_started.html">Getting Started</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../tutorial/index.html">Tutorial</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../glossary.html">Glossary</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../developers/index.html">Development</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../faq.html">FAQ</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../related_projects.html">Related packages</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../roadmap.html">Roadmap</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../about.html">About us</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="https://github.com/scikit-learn/scikit-learn">GitHub</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="https://scikit-learn.org/dev/versions.html">Other Versions</a>
          </div>
        </li>
      </ul>
      <div id="searchbox" role="search">
          <div class="searchformwrapper">
          <form class="search" action="../../search.html" method="get">
            <input class="sk-search-text-input" type="text" name="q" aria-labelledby="searchlabel" />
            <input class="sk-search-text-btn" type="submit" value="Go" />
          </form>
          </div>
      </div>
    </div>
  </div>
</nav>
<div class="d-flex" id="sk-doc-wrapper">
    <input type="checkbox" name="sk-toggle-checkbox" id="sk-toggle-checkbox">
    <label id="sk-sidemenu-toggle" class="sk-btn-toggle-toc btn sk-btn-primary" for="sk-toggle-checkbox">Toggle Menu</label>
    <div id="sk-sidebar-wrapper" class="border-right">
      <div class="sk-sidebar-toc-wrapper">
        <div class="sk-sidebar-toc-logo">
          <a href="../../index.html">
            <img
              class="sk-brand-img"
              src="../../_static/scikit-learn-logo-small.png"
              alt="logo"/>
          </a>
        </div>
        <div class="btn-group w-100 mb-2" role="group" aria-label="rellinks">
            <a href="sklearn.datasets.load_sample_images.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="sklearn.datasets.load_sample_images">Prev</a><a href="../classes.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="API Reference">Up</a>
            <a href="sklearn.datasets.load_svmlight_files.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="sklearn.datasets.load_svmlight_files">Next</a>
        </div>
        <div class="alert alert-danger p-1 mb-2" role="alert">
          <p class="text-center mb-0">
          <strong>scikit-learn 0.22</strong><br/>
          <a href="http://scikit-learn.org/dev/versions.html">Other versions</a>
          </p>
        </div>
        <div class="alert alert-warning p-1 mb-2" role="alert">
          <p class="text-center mb-0">
            Please <a class="font-weight-bold" href="../../about.html#citing-scikit-learn"><string>cite us</string></a> if you use the software.
          </p>
        </div>
          <div class="sk-sidebar-toc">
            <ul>
<li><a class="reference internal" href="#"><code class="xref py py-mod docutils literal notranslate"><span class="pre">sklearn.datasets</span></code>.load_svmlight_file</a></li>
</ul>

          </div>
      </div>
    </div>
    <div id="sk-page-content-wrapper">
      <div class="sk-page-content container-fluid body px-md-3" role="main">
        
  <div class="section" id="sklearn-datasets-load-svmlight-file">
<h1><a class="reference internal" href="../classes.html#module-sklearn.datasets" title="sklearn.datasets"><code class="xref py py-mod docutils literal notranslate"><span class="pre">sklearn.datasets</span></code></a>.load_svmlight_file<a class="headerlink" href="#sklearn-datasets-load-svmlight-file" title="Permalink to this headline">¶</a></h1>
<dl class="function">
<dt id="sklearn.datasets.load_svmlight_file">
<code class="sig-prename descclassname">sklearn.datasets.</code><code class="sig-name descname">load_svmlight_file</code><span class="sig-paren">(</span><em class="sig-param">f</em>, <em class="sig-param">n_features=None</em>, <em class="sig-param">dtype=&lt;class 'numpy.float64'&gt;</em>, <em class="sig-param">multilabel=False</em>, <em class="sig-param">zero_based='auto'</em>, <em class="sig-param">query_id=False</em>, <em class="sig-param">offset=0</em>, <em class="sig-param">length=-1</em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/scikit-learn/scikit-learn/blob/5f3c3f037/sklearn/datasets/_svmlight_format.py#L40"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#sklearn.datasets.load_svmlight_file" title="Permalink to this definition">¶</a></dt>
<dd><p>Load datasets in the svmlight / libsvm format into sparse CSR matrix</p>
<p>This format is a text-based format, with one sample per line. It does
not store zero valued features hence is suitable for sparse dataset.</p>
<p>The first element of each line can be used to store a target variable
to predict.</p>
<p>This format is used as the default format for both svmlight and the
libsvm command line programs.</p>
<p>Parsing a text based source can be expensive. When working on
repeatedly on the same dataset, it is recommended to wrap this
loader with joblib.Memory.cache to store a memmapped backup of the
CSR results of the first call and benefit from the near instantaneous
loading of memmapped structures for the subsequent calls.</p>
<p>In case the file contains a pairwise preference constraint (known
as “qid” in the svmlight format) these are ignored unless the
query_id parameter is set to True. These pairwise preference
constraints can be used to constraint the combination of samples
when using pairwise loss functions (as is the case in some
learning to rank problems) so that only pairs with the same
query_id value are considered.</p>
<p>This implementation is written in Cython and is reasonably fast.
However, a faster API-compatible loader is also available at:</p>
<blockquote>
<div><p><a class="reference external" href="https://github.com/mblondel/svmlight-loader">https://github.com/mblondel/svmlight-loader</a></p>
</div></blockquote>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl class="simple">
<dt><strong>f</strong><span class="classifier">{str, file-like, int}</span></dt><dd><p>(Path to) a file to load. If a path ends in “.gz” or “.bz2”, it will
be uncompressed on the fly. If an integer is passed, it is assumed to
be a file descriptor. A file-like or file descriptor will not be closed
by this function. A file-like object must be opened in binary mode.</p>
</dd>
<dt><strong>n_features</strong><span class="classifier">int or None</span></dt><dd><p>The number of features to use. If None, it will be inferred. This
argument is useful to load several files that are subsets of a
bigger sliced dataset: each subset might not have examples of
every feature, hence the inferred shape might vary from one
slice to another.
n_features is only required if <code class="docutils literal notranslate"><span class="pre">offset</span></code> or <code class="docutils literal notranslate"><span class="pre">length</span></code> are passed a
non-default value.</p>
</dd>
<dt><strong>dtype</strong><span class="classifier">numpy data type, default np.float64</span></dt><dd><p>Data type of dataset to be loaded. This will be the data type of the
output numpy arrays <code class="docutils literal notranslate"><span class="pre">X</span></code> and <code class="docutils literal notranslate"><span class="pre">y</span></code>.</p>
</dd>
<dt><strong>multilabel</strong><span class="classifier">boolean, optional, default False</span></dt><dd><p>Samples may have several labels each (see
<a class="reference external" href="https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html">https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html</a>)</p>
</dd>
<dt><strong>zero_based</strong><span class="classifier">boolean or “auto”, optional, default “auto”</span></dt><dd><p>Whether column indices in f are zero-based (True) or one-based
(False). If column indices are one-based, they are transformed to
zero-based to match Python/NumPy conventions.
If set to “auto”, a heuristic check is applied to determine this from
the file contents. Both kinds of files occur “in the wild”, but they
are unfortunately not self-identifying. Using “auto” or True should
always be safe when no <code class="docutils literal notranslate"><span class="pre">offset</span></code> or <code class="docutils literal notranslate"><span class="pre">length</span></code> is passed.
If <code class="docutils literal notranslate"><span class="pre">offset</span></code> or <code class="docutils literal notranslate"><span class="pre">length</span></code> are passed, the “auto” mode falls back
to <code class="docutils literal notranslate"><span class="pre">zero_based=True</span></code> to avoid having the heuristic check yield
inconsistent results on different segments of the file.</p>
</dd>
<dt><strong>query_id</strong><span class="classifier">boolean, default False</span></dt><dd><p>If True, will return the query_id array for each file.</p>
</dd>
<dt><strong>offset</strong><span class="classifier">integer, optional, default 0</span></dt><dd><p>Ignore the offset first bytes by seeking forward, then
discarding the following bytes up until the next new line
character.</p>
</dd>
<dt><strong>length</strong><span class="classifier">integer, optional, default -1</span></dt><dd><p>If strictly positive, stop reading any new line of data once the
position in the file has reached the (offset + length) bytes threshold.</p>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl class="simple">
<dt><strong>X</strong><span class="classifier">scipy.sparse matrix of shape (n_samples, n_features)</span></dt><dd></dd>
<dt><strong>y</strong><span class="classifier">ndarray of shape (n_samples,), or, in the multilabel a list of</span></dt><dd><p>tuples of length n_samples.</p>
</dd>
<dt><strong>query_id</strong><span class="classifier">array of shape (n_samples,)</span></dt><dd><p>query_id for each sample. Only returned when query_id is set to
True.</p>
</dd>
</dl>
</dd>
</dl>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<dl class="simple">
<dt><a class="reference internal" href="sklearn.datasets.load_svmlight_files.html#sklearn.datasets.load_svmlight_files" title="sklearn.datasets.load_svmlight_files"><code class="xref py py-obj docutils literal notranslate"><span class="pre">load_svmlight_files</span></code></a></dt><dd><p>similar function for loading multiple files in this format, enforcing the same number of features/columns on all of them.</p>
</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p>To use joblib.Memory to cache the svmlight file:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">joblib</span> <span class="kn">import</span> <span class="n">Memory</span>
<span class="kn">from</span> <span class="nn">.datasets</span> <span class="kn">import</span> <span class="n">load_svmlight_file</span>
<span class="n">mem</span> <span class="o">=</span> <span class="n">Memory</span><span class="p">(</span><span class="s2">&quot;./mycache&quot;</span><span class="p">)</span>

<span class="nd">@mem</span><span class="o">.</span><span class="n">cache</span>
<span class="k">def</span> <span class="nf">get_data</span><span class="p">():</span>
    <span class="n">data</span> <span class="o">=</span> <span class="n">load_svmlight_file</span><span class="p">(</span><span class="s2">&quot;mysvmlightfile&quot;</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">data</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">data</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>

<span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">get_data</span><span class="p">()</span>
</pre></div>
</div>
</dd></dl>

<div class="clearer"></div></div>


      </div>
    <div class="container">
      <footer class="sk-content-footer">
            &copy; 2007 - 2019, scikit-learn developers (BSD License).
          <a href="../../_sources/modules/generated/sklearn.datasets.load_svmlight_file.rst.txt" rel="nofollow">Show this page source</a>
      </footer>
    </div>
  </div>
</div>
<script src="../../_static/js/vendor/bootstrap.min.js"></script>

<script>
    window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
    ga('create', 'UA-22606712-2', 'auto');
    ga('set', 'anonymizeIp', true);
    ga('send', 'pageview');
</script>
<script async src='https://www.google-analytics.com/analytics.js'></script>


<script>
$(document).ready(function() {
    /* Add a [>>>] button on the top-right corner of code samples to hide
     * the >>> and ... prompts and the output and thus make the code
     * copyable. */
    var div = $('.highlight-python .highlight,' +
                '.highlight-python3 .highlight,' +
                '.highlight-pycon .highlight,' +
		'.highlight-default .highlight')
    var pre = div.find('pre');

    // get the styles from the current theme
    pre.parent().parent().css('position', 'relative');
    var hide_text = 'Hide prompts and outputs';
    var show_text = 'Show prompts and outputs';

    // create and add the button to all the code blocks that contain >>>
    div.each(function(index) {
        var jthis = $(this);
        if (jthis.find('.gp').length > 0) {
            var button = $('<span class="copybutton">&gt;&gt;&gt;</span>');
            button.attr('title', hide_text);
            button.data('hidden', 'false');
            jthis.prepend(button);
        }
        // tracebacks (.gt) contain bare text elements that need to be
        // wrapped in a span to work with .nextUntil() (see later)
        jthis.find('pre:has(.gt)').contents().filter(function() {
            return ((this.nodeType == 3) && (this.data.trim().length > 0));
        }).wrap('<span>');
    });

    // define the behavior of the button when it's clicked
    $('.copybutton').click(function(e){
        e.preventDefault();
        var button = $(this);
        if (button.data('hidden') === 'false') {
            // hide the code output
            button.parent().find('.go, .gp, .gt').hide();
            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden');
            button.css('text-decoration', 'line-through');
            button.attr('title', show_text);
            button.data('hidden', 'true');
        } else {
            // show the code output
            button.parent().find('.go, .gp, .gt').show();
            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible');
            button.css('text-decoration', 'none');
            button.attr('title', hide_text);
            button.data('hidden', 'false');
        }
    });

	/*** Add permalink buttons next to glossary terms ***/
	$('dl.glossary > dt[id]').append(function() {
		return ('<a class="headerlink" href="#' +
			    this.getAttribute('id') +
			    '" title="Permalink to this term">¶</a>');
	});
  /*** Hide navbar when scrolling down ***/
  // Returns true when headerlink target matches hash in url
  (function() {
    hashTargetOnTop = function() {
        var hash = window.location.hash;
        if ( hash.length < 2 ) { return false; }

        var target = document.getElementById( hash.slice(1) );
        if ( target === null ) { return false; }

        var top = target.getBoundingClientRect().top;
        return (top < 2) && (top > -2);
    };

    // Hide navbar on load if hash target is on top
    var navBar = document.getElementById("navbar");
    var navBarToggler = document.getElementById("sk-navbar-toggler");
    var navBarHeightHidden = "-" + navBar.getBoundingClientRect().height + "px";
    var $window = $(window);

    hideNavBar = function() {
        navBar.style.top = navBarHeightHidden;
    };

    showNavBar = function() {
        navBar.style.top = "0";
    }

    if (hashTargetOnTop()) {
        hideNavBar()
    }

    var prevScrollpos = window.pageYOffset;
    hideOnScroll = function(lastScrollTop) {
        if (($window.width() < 768) && (navBarToggler.getAttribute("aria-expanded") === 'true')) {
            return;
        }
        if (lastScrollTop > 2 && (prevScrollpos <= lastScrollTop) || hashTargetOnTop()){
            hideNavBar()
        } else {
            showNavBar()
        }
        prevScrollpos = lastScrollTop;
    };

    /*** high preformance scroll event listener***/
    var raf = window.requestAnimationFrame ||
        window.webkitRequestAnimationFrame ||
        window.mozRequestAnimationFrame ||
        window.msRequestAnimationFrame ||
        window.oRequestAnimationFrame;
    var lastScrollTop = $window.scrollTop();

    if (raf) {
        loop();
    }

    function loop() {
        var scrollTop = $window.scrollTop();
        if (lastScrollTop === scrollTop) {
            raf(loop);
            return;
        } else {
            lastScrollTop = scrollTop;
            hideOnScroll(lastScrollTop);
            raf(loop);
        }
    }
  })();
});

</script>
    
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
    
</body>
</html>